revised version of mini-project 02 goes here
Loading the tidyverse, sf, and plotly libraries, then reading the West Roxbury housing dataset and converting its column names to syntactically valid R names so that they are easier to reference when creating plots later.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.3.2, PROJ 7.2.1; sf_use_s2() is TRUE
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the West Roxbury housing dataset from its GitHub-hosted CSV file.
housing_data <- read_csv(
  file = "https://raw.githubusercontent.com/reisanar/datasets/master/WestRoxbury.csv"
)
## Rows: 5802 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): REMODEL
## dbl (13): TOTAL VALUE, TAX, LOT SQFT, YR BUILT, GROSS AREA, LIVING AREA, FLO...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert the column names to syntactically valid, unique R names
# (e.g. "TOTAL VALUE" becomes "TOTAL.VALUE") so they can be used unquoted.
colnames(housing_data) <- make.names(colnames(housing_data), unique = TRUE)
housing_data
## # A tibble: 5,802 × 14
## TOTAL.VALUE TAX LOT.SQFT YR.BUILT GROSS.AREA LIVING.AREA FLOORS ROOMS
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 344. 4330 9965 1880 2436 1352 2 6
## 2 413. 5190 6590 1945 3108 1976 2 10
## 3 330. 4152 7500 1890 2294 1371 2 8
## 4 499. 6272 13773 1957 5032 2608 1 9
## 5 332. 4170 5000 1910 2370 1438 2 7
## 6 337. 4244 5142 1950 2124 1060 1 6
## 7 359. 4521 5000 1954 3220 1916 2 7
## 8 320. 4030 10000 1950 2208 1200 1 6
## 9 334. 4195 6835 1958 2582 1092 1 5
## 10 409. 5150 5093 1900 4818 2992 2 8
## # … with 5,792 more rows, and 6 more variables: BEDROOMS <dbl>,
## # FULL.BATH <dbl>, HALF.BATH <dbl>, KITCHEN <dbl>, FIREPLACE <dbl>,
## # REMODEL <chr>
Reading the Florida lakes shapefile data.
# Load the Florida lakes polygons from the local shapefile and print them.
florida_lakes <- read_sf(
  dsn = "data/Florida_Lakes/Florida_Lakes.shp"
)
florida_lakes
## Simple feature collection with 4243 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -87.42774 ymin: 25.02625 xmax: -80.03097 ymax: 31.00254
## Geodetic CRS: WGS 84
## # A tibble: 4,243 × 7
## PERIMETER NAME COUNTY OBJECTID SHAPEAREA SHAPELEN geometry
## <dbl> <chr> <chr> <int> <dbl> <dbl> <MULTIPOLYGON [°]>
## 1 11082. Lake … ORANGE 1 1818000. 11082. (((-81.34813 28.62354, -…
## 2 2834. Black… ESCAM… 2 31380. 2834. (((-87.42029 30.49087, -…
## 3 18768. Lake … HIGHL… 3 13601177. 18768. (((-81.4614 27.46472, -8…
## 4 493. Halfm… ESCAM… 4 6337. 493. (((-87.3131 30.74034, -8…
## 5 5663. Cresc… ESCAM… 5 338242. 5663. (((-87.27591 30.4692, -8…
## 6 317. Black… SANTA… 6 2380. 317. (((-87.26869 30.69546, -…
## 7 181. Beave… ESCAM… 7 1381. 181. (((-87.27064 30.70558, -…
## 8 1376. Salte… ESCAM… 8 24421. 1376. (((-87.26273 30.94937, -…
## 9 1914. Forty… SANTA… 9 178663. 1914. (((-87.18693 30.81357, -…
## 10 328. Hutso… SANTA… 10 7838. 328. (((-87.14079 30.96851, -…
## # … with 4,233 more rows
Reading the detailed Florida state boundary shapefile. Data source.
# Load the detailed Florida state boundary from the local shapefile and
# print it.
florida_shape <- read_sf(
  dsn = "data/Detailed_Florida_State_Boundary/Detailed_Florida_State_Boundary.shp"
)
florida_shape
## Simple feature collection with 1 feature and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -87.63493 ymin: 24.52104 xmax: -80.03132 ymax: 31.0002
## Geodetic CRS: WGS 84
## # A tibble: 1 × 5
## OBJECTID ID Shape_STAr Shape_STLe geometry
## <int> <int> <dbl> <dbl> <MULTIPOLYGON [°]>
## 1 1 1 145697142592 7614166. (((-85.54423 30.00015, -85.54407 29.99…
Filtering the lakes dataset down to Orange County only, for ease of handling and further exploration.
# Keep only the lakes whose COUNTY is "ORANGE". Assignment uses `<-` (the
# idiomatic R operator), and a plain equality test replaces `%in%` against a
# one-element vector; filter() drops NA comparisons, so the same rows are kept.
florida_lakes_orange <- filter(florida_lakes, COUNTY == "ORANGE")
florida_lakes_orange
## Simple feature collection with 365 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -81.69661 ymin: 28.34709 xmax: -80.95306 ymax: 28.78791
## Geodetic CRS: WGS 84
## # A tibble: 365 × 7
## PERIMETER NAME COUNTY OBJECTID SHAPEAREA SHAPELEN geometry
## * <dbl> <chr> <chr> <int> <dbl> <dbl> <MULTIPOLYGON [°]>
## 1 11082. Lake … ORANGE 1 1818000. 11082. (((-81.34813 28.62354, -…
## 2 13601. Lake … ORANGE 69 3680008. 13601. (((-81.51848 28.49421, -…
## 3 18261. Big S… ORANGE 70 4131164. 18261. (((-81.48132 28.40898, -…
## 4 979. Lake … ORANGE 324 66933. 979. (((-81.50882 28.60936, -…
## 5 492. Lake … ORANGE 329 16645. 492. (((-81.49032 28.58921, -…
## 6 6288. Crook… ORANGE 330 433676. 6288. (((-81.48208 28.5978, -8…
## 7 547. Carp … ORANGE 331 19872. 547. (((-81.50163 28.65873, -…
## 8 1193. Lake … ORANGE 332 98047. 1193. (((-81.53446 28.55185, -…
## 9 1727. Lake … ORANGE 334 227921. 1727. (((-81.5 28.39675, -81.4…
## 10 1048. Geyer… ORANGE 337 67286. 1048. (((-81.48597 28.53726, -…
## # … with 355 more rows
Generating a scatterplot to explore the data and find outliers. I used plotly's hover tooltips to identify which data points were outliers.
# Scatterplot of perimeter against area for the Orange County lakes. The lake
# name is mapped to `text` so it can be shown as the hover tooltip.
perimeter_area <- ggplot(
  data = florida_lakes_orange,
  mapping = aes(x = PERIMETER, y = SHAPEAREA, text = NAME)
) +
  geom_point()

# Render interactively, showing only the lake name on hover.
ggplotly(p = perimeter_area, tooltip = "text")
Filtering the outliers out of the dataset.
# Drop the two lakes that were identified as outliers in the scatterplot.
# Assignment uses `<-` (the idiomatic R operator) rather than `=`.
florida_lakes_orange_outlierless <- filter(
  florida_lakes_orange,
  !(NAME %in% c("Lake Apopka", "Johns Lake"))
)
Fitting a linear model of lake perimeter as a function of lake area.
# Simple linear regression with perimeter as the response and area as the
# single predictor, fitted on the Orange County lakes without the outliers.
perimeter_vs_area <- lm(
  formula = PERIMETER ~ SHAPEAREA,
  data = florida_lakes_orange_outlierless
)
summary(perimeter_vs_area)
##
## Call:
## lm(formula = PERIMETER ~ SHAPEAREA, data = florida_lakes_orange_outlierless)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7966.6 -605.5 -292.0 206.3 6973.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.222e+03 7.440e+01 16.42 <2e-16 ***
## SHAPEAREA 3.357e-03 8.449e-05 39.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1307 on 361 degrees of freedom
## Multiple R-squared: 0.8139, Adjusted R-squared: 0.8133
## F-statistic: 1578 on 1 and 361 DF, p-value: < 2.2e-16
# broom supplies tidy() and glance() for summarising model objects as tibbles.
library(broom)
# Coefficient-level summary: one row per model term.
perimeter_vs_area %>% tidy()
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1222. 74.4 16.4 1.23e- 45
## 2 SHAPEAREA 0.00336 0.0000845 39.7 7.51e-134
# Model-level summary: a single row of fit statistics (R-squared, AIC, ...).
perimeter_vs_area %>% glance()
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.814 0.813 1307. 1578. 7.51e-134 1 -3119. 6243. 6255.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
Creating an interactive plot to compare total value, gross area, and number of rooms for West Roxbury data.
# Interactive scatterplot of total value against gross area for the West
# Roxbury homes, coloured by number of rooms (converted to a factor so each
# room count gets its own discrete colour).
housing_title <- "Total Value of Homes vs Total Area"
housing_subtitle <- "for houses in the West Roxbury neighborhood"

housing_plot <- ggplot(
  housing_data,
  aes(x = GROSS.AREA, y = TOTAL.VALUE, color = as.factor(ROOMS))
) +
  geom_point() +
  theme(plot.title.position = "plot") +
  labs(
    title = housing_title,
    subtitle = housing_subtitle,
    x = "Total Area",
    y = "Total Value",
    color = "Number of Rooms"
  )

# ggplotly() does not carry ggplot2 subtitles over to the plotly figure, so
# the subtitle is appended to the plotly title as a smaller second line.
ggplotly(housing_plot) %>%
  layout(
    title = list(
      text = paste0(housing_title, "<br><sup>", housing_subtitle, "</sup>")
    )
  )
Creating an interactive map of the lakes of Florida, using the Florida lakes shapefile data and the detailed state boundary shapefile data.
# Interactive map of Florida's lakes drawn on top of the state outline. The
# lake name is mapped to `text` so it can be shown as the hover tooltip.
lake_title <- "Lakes in the State of Florida"
lake_subtitle <- "hover over each lake to see its name"

lake_plot <- ggplot() +
  geom_sf(color = "#70DB92", fill = "#98F5B4", data = florida_shape) +
  geom_sf(
    color = "#54A4F0", fill = "#498FD1",
    data = florida_lakes, mapping = aes(text = NAME)
  ) +
  # theme_classic() is a complete theme and overrides theme() settings added
  # before it, so it must come first for legend.position to take effect.
  theme_classic() +
  theme(legend.position = "none", plot.title.position = "plot") +
  labs(title = lake_title, subtitle = lake_subtitle)
## Warning: Ignoring unknown aesthetics: text
# ggplotly() does not carry ggplot2 subtitles over to the plotly figure, so
# the hover hint is appended to the plotly title as a smaller second line.
ggplotly(lake_plot, tooltip = "text") %>%
  layout(
    title = list(
      text = paste0(lake_title, "<br><sup>", lake_subtitle, "</sup>")
    )
  )